{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from sklearn.externals import joblib\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn import tree\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "DIR = 'PATH/TO/YOUR/DATA'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = pd.read_csv(os.path.join(DIR,'files/unzipped_data/application_train.csv'))\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_ext = X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'TARGET']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# X_ext = X_ext.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "        for function_name in ['nanmin', 'nanmax', 'sum', 'mean', 'var', 'median', 'std', 'nanmedian', 'nanmean', 'min', 'max']:\n",
    "            X_ext['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(\n",
    "                X_ext[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_ext.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_ext['EXT_SRC_weighted3'] = (X.EXT_SOURCE_1*2+X.EXT_SOURCE_2*3+X.EXT_SOURCE_3*4)/9\n",
    "X_ext['EXT_SRC_weighted2'] = (X.EXT_SOURCE_1*3+X.EXT_SOURCE_2*4+X.EXT_SOURCE_3*2)/9\n",
    "X_ext['EXT_SRC_weighted1'] = (X.EXT_SOURCE_1*4+X.EXT_SOURCE_2*2+X.EXT_SOURCE_3*3)/9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_ext.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_ext_corr = abs(X_ext.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_ext_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(X_ext_corr, \n",
    "            xticklabels=X_ext_corr.columns,\n",
    "            yticklabels=X_ext_corr.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test = train_test_split(X_ext)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_train = X_train['TARGET']\n",
    "Y_test = X_test['TARGET']\n",
    "\n",
    "X_train = X_train.drop(columns='TARGET')\n",
    "X_test = X_test.drop(columns='TARGET')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = X_train.fillna(0)\n",
    "X_test = X_test.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = tree.DecisionTreeClassifier()\n",
    "clf.fit(X_train, Y_train)\n",
    "\n",
    "print(\"R^2 on the train set:\")\n",
    "print(clf.score(X_train, Y_train))\n",
    "\n",
    "print(\"\\nR^2 on the test set:\")\n",
    "print(clf.score(X_test, Y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_importances.sort_values(ascending=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
